from pathlib import Path
from pyarrow import parquet


def get_malay():
    """Dump the ``src_text`` column of every parquet file to a text file.

    Recursively scans ``/root/autodl-tmp/malay`` for ``*.parquet`` files,
    prints each path as it is processed, and writes every ``src_text`` value
    on its own line to ``/root/autodl-tmp/malay/malay_vocab.txt``. The output
    file is truncated on every call.

    NOTE(review): ``src_text`` is presumably the Malay side of a parallel
    corpus (going by the output file name) -- confirm against the dataset.

    Raises:
        TypeError: If a ``src_text`` value cannot be concatenated with a
            string (e.g. a null entry).
    """
    data_dir = r'/root/autodl-tmp/malay'
    # The context manager guarantees the output file is flushed and closed
    # even if reading or writing one of the parquet files fails midway.
    # newline='' disables newline translation, so every line ends in '\n'.
    with open('/root/autodl-tmp/malay/malay_vocab.txt', 'w',
              encoding='utf-8', newline='') as fw:
        for parquet_file in Path(data_dir).rglob("*.parquet"):
            print(parquet_file)
            # Load only the column that is written out instead of the whole
            # table; the other columns are never used here.
            table = parquet.ParquetFile(parquet_file).read(columns=['src_text'])
            df = table.to_pandas()
            # Iterating the column directly avoids building a Series per row,
            # which is what iterrows() does.
            for text in df['src_text']:
                fw.write(text + '\n')


def get_chinese():
    """Dump the ``trans_text`` column of every parquet file to a text file.

    Recursively scans ``/root/autodl-tmp/malay`` for ``*.parquet`` files,
    prints each path as it is processed, and writes every ``trans_text``
    value on its own line to ``/root/autodl-tmp/malay/chinese_vocab.txt``.
    The output file is truncated on every call.

    NOTE(review): ``trans_text`` is presumably the Chinese translation side
    of a parallel corpus (going by the output file name) -- confirm against
    the dataset.

    Raises:
        TypeError: If a ``trans_text`` value cannot be concatenated with a
            string (e.g. a null entry).
    """
    data_dir = r'/root/autodl-tmp/malay'
    # The context manager guarantees the output file is flushed and closed
    # even if reading or writing one of the parquet files fails midway.
    # newline='' disables newline translation, so every line ends in '\n'.
    with open('/root/autodl-tmp/malay/chinese_vocab.txt', 'w',
              encoding='utf-8', newline='') as fw:
        for parquet_file in Path(data_dir).rglob("*.parquet"):
            print(parquet_file)
            # Load only the column that is written out instead of the whole
            # table; the other columns are never used here.
            table = parquet.ParquetFile(parquet_file).read(columns=['trans_text'])
            df = table.to_pandas()
            # Iterating the column directly avoids building a Series per row,
            # which is what iterrows() does.
            for text in df['trans_text']:
                fw.write(text + '\n')

# Run both exporters when the module is executed or imported:
# first the src_text dump, then the trans_text dump.
for _export in (get_malay, get_chinese):
    _export()
